In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white")

import numpy as np
import pandas as pd
import scipy.io as sio

import sys
sys.path.append('..')

from helper import general
from helper import pca

## Load data


In [6]:
# Load the example data set for the PCA exercise.
mat = sio.loadmat('./data/ex7data1.mat')
X = mat['X']  # index, don't .get(): fail loudly if the key is missing instead of carrying None forward

# visualize raw data
print(X.shape)

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.lmplot(x='X1', y='X2',
           data=pd.DataFrame(X, columns=['X1', 'X2']),
           fit_reg=False)


(50, 2)
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x11264a748>

## Normalize data


In [3]:
# Feature-normalize before PCA so every dimension contributes on the same scale.
X_norm = pca.normalize(X)

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.lmplot(x='X1', y='X2',
           data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
           fit_reg=False)


Out[3]:
<seaborn.axisgrid.FacetGrid at 0x106c79668>

## Covariance matrix $\Sigma$

This is the *biased* sample covariance matrix; for the unbiased version, divide by $m-1$ instead of $m$.


In [4]:
# Covariance of the normalized data; diagonal is 1 because each feature was standardized.
Sigma = pca.covariance_matrix(X_norm)  # capital greek Sigma
Sigma  # (n, n)


Out[4]:
array([[ 1.        ,  0.73553038],
       [ 0.73553038,  1.        ]])

In [12]:
U, S, V = pca.pca(X_norm)

In [13]:
U


Out[13]:
array([[-0.70710678, -0.70710678],
       [-0.70710678,  0.70710678]])

In [7]:
# First principal component: the leading *column* of U — np.linalg.svd returns
# the directions as columns, so U[:, 0] is the correct convention. The original
# U[0] (first row) only gave the same vector because this particular U happens
# to be symmetric (see Out[13]).
u1 = U[:, 0]
u1


Out[7]:
array([-0.70710678, -0.70710678])

## Project data to a lower dimension


In [8]:
# show top 10 projected data
Z = pca.project_data(X_norm, U, 1)
Z[:10]


Out[8]:
array([[ 1.49631261],
       [-0.92218067],
       [ 1.22439232],
       [ 1.64386173],
       [ 1.2732206 ],
       [-0.97681976],
       [ 1.26881187],
       [-2.34148278],
       [-0.02999141],
       [-0.78171789]])

In [16]:
# Side by side: the original 2-D (normalized) data and its 1-D projection Z.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
            fit_reg=False,
            ax=ax1)
ax1.set_title('Original dimension')

# Z is (m, 1) (see Out[8]); flatten to the 1-D vector rugplot expects
sns.rugplot(x=Z[:, 0], ax=ax2)
ax2.set_xlabel('Z')
ax2.set_title('Z dimension')


Out[16]:
<matplotlib.text.Text at 0x1163d08d0>

## Recover data to the original dimension

Of course, there is inevitable information loss when you map data from the lower dimension back up to the original, higher dimension.


In [17]:
# Map the 1-D projection back into 2-D and compare against the original data.
X_recover = pca.recover_data(Z, U)

fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(12, 4))

# Z is (m, 1); flatten to the 1-D vector rugplot expects
sns.rugplot(x=Z[:, 0], ax=ax1)
ax1.set_title('Z dimension')
ax1.set_xlabel('Z')

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(X_recover, columns=['X1', 'X2']),
            fit_reg=False,
            ax=ax2)
ax2.set_title("2D projection from Z")

sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
            fit_reg=False,
            ax=ax3)
ax3.set_title('Original dimension')


Out[17]:
<matplotlib.text.Text at 0x1166f0e80>

The projection from $(X_1, X_2)$ to $Z$ can be visualized like this:


In [ ]: